from PIL import Image
from IPython.display import display
# Load the image file 'alien.png' from the working directory and render it inline.
img = Image.open('alien.png')
display(img)

import sqlalchemy

# Build a SQLAlchemy engine for the local MySQL database `ian_k`. The result is
# not bound to a name, so this cell only displays the engine's repr.
# NOTE(review): this uses the root account with an inline password — consider
# reading the credentials from the environment instead.
sqlalchemy.create_engine('mysql://root:**********@localhost/ian_k')

Engine(mysql://root:***@localhost/ian_k)

# Load the IPython extension that provides the %sql magic.
%load_ext sql

# Point %sql at the local MySQL database `ian_k`.
%sql mysql://root:**********@localhost/ian_k

# Fetch every row of the `aliens` table.
aliens = %sql select * from aliens

 * mysql://root:***@localhost/ian_k
50000 rows affected.

# Fetch every row of the `details` table.
details = %sql select * from details

 * mysql://root:***@localhost/ian_k
50000 rows affected.

# Fetch every row of the `location` table.
location = %sql select * from location

 * mysql://root:***@localhost/ian_k
50000 rows affected.

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import fuzz

# Convert the `aliens` query result into a pandas DataFrame and preview it.
aliens = pd.DataFrame(aliens)
aliens.head()

# Column dtypes and non-null counts.
aliens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          50000 non-null  int64 
 1   first_name  50000 non-null  object
 2   last_name   50000 non-null  object
 3   email       50000 non-null  object
 4   gender      50000 non-null  object
 5   type        50000 non-null  object
 6   birth_year  50000 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.7+ MB

# Convert the `details` query result into a DataFrame, renaming its key column
# `detail_id` to `id` (the column name the later merge joins on).
details = pd.DataFrame(details).rename(columns={'detail_id': 'id'})
details.head()

# Column dtypes and non-null counts.
details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 50000 non-null  int64 
 1   favorite_food      50000 non-null  object
 2   feeding_frequency  50000 non-null  object
 3   aggressive         50000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB

# Convert the `location` query result into a DataFrame, renaming its key column
# `loc_id` to `id` (the column name the later merge joins on).
location = pd.DataFrame(location).rename(columns={'loc_id': 'id'})
location.head()

# Column dtypes and non-null counts.
location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                50000 non-null  int64 
 1   current_location  50000 non-null  object
 2   state             50000 non-null  object
 3   country           50000 non-null  object
 4   occupation        50000 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB

# Join the three tables on their shared `id` column into one wide frame.
alien_info = aliens.merge(details, on='id').merge(location, on='id')
alien_info.head()

# Read the state shapes, keep only the name and geometry columns, and rename
# `NAME` to `state` so it lines up with the `state` column of the alien data.
usa = gpd.read_file('tl_2014_us_state.shx')[['NAME', 'geometry']].rename(columns={'NAME': 'state'})
usa.head()

# Keep only the state shapes whose name occurs in the aliens' location data.
alien_states = usa[usa['state'].isin(location['state'].unique())]
alien_states.head()
# type(alien_states)

# Number of aliens per state, ordered from the most to the least populated state.
alien_population = alien_info['state'].value_counts()
alien_population.head(5)

state
Texas         5413
California    5410
Florida       4176
New York      2690
Ohio          1851
Name: count, dtype: int64

# Turn the per-state counts into a frame with `state` and `population` columns.
# The rename relies on the counts Series being named 'count'.
alien_population = gpd.GeoDataFrame(alien_population).rename(columns={'count': 'population'}).reset_index()
alien_population.head()

# Bar chart of the alien population per state.
pd.DataFrame(alien_population).plot.bar(x='state', y='population', figsize=(12, 4), color='#844593')
plt.show()

# Attach each state's geometry to its alien count and wrap the merged result
# in a GeoDataFrame so it can be drawn as a map.
alien_population_geo = gpd.GeoDataFrame(alien_population.merge(alien_states, on='state'))
alien_population_geo.head()

# Draw the state shapes with grey edges first, then draw them again on the same
# axes coloured by the `population` column.
ax = alien_population_geo.plot(edgecolor='grey', figsize=(20, 20))
alien_population_geo.plot(ax=ax, column='population', cmap='BuPu')
plt.title('Alien population per state (the darker the more)')
plt.show()

# Restrict to four hard-coded states (the top four of the per-state counts
# shown earlier) and keep only the state and alien type columns.
d1 = alien_info[alien_info['state'].isin(['Texas', 'California', 'Florida', 'New York'])][['state', 'type']]
d1.head()

# For each of those states, report the alien type with the highest count.
for state in d1['state'].unique():
    # Ascending sort puts the most common type last, so tail(1) picks it.
    max_population = d1[d1['state'] == state].groupby('type')['type'].count().sort_values()
    # NOTE(review): "Widest spieces" in the message below is misspelled.
    print(f"Widest spieces in {state} is {max_population.tail(1).index[0]} ({max_population.max()})")
    # print(max_population)

Widest spieces in California is Green (1105)
Widest spieces in Florida is Flatwoods (878)
Widest spieces in Texas is Reptile (1128)
Widest spieces in New York is Nordic (583)

# Load the `state` and `2020_census` columns from the per-state population file.
population_per_state = pd.read_csv('us_pop_by_state.csv')[['state', '2020_census']]
population_per_state.head()

# Keep only the states that occur in the aliens' location data.
population_per_state = population_per_state[population_per_state['state'].isin(location['state'].unique())]
population_per_state.head()

def percentage(df):
    """Return `population` as a percentage of `2020_census`, rounded to 4 decimals.

    `df` is a row-like mapping (for example a row passed in by
    ``DataFrame.apply(..., axis=1)``) with `population` and `2020_census` entries.
    """
    share = df['population'] / df['2020_census']
    return round(share * 100, 4)

# Attach each state's census figure to its alien count, then compute the alien
# count as a percentage of the census figure, row by row.
alien_percentage = alien_population.merge(population_per_state, on='state')
alien_percentage['%_of_population'] = alien_percentage.apply(percentage, axis=1)
alien_percentage.head()

# Pie chart comparing the summed alien counts with the summed census figures
# over all rows of `alien_percentage`; `explode` pulls the 'Aliens' wedge out.
labels = ['Aliens', 'Humans']
sizes = [sum(alien_percentage['population']), sum(alien_percentage['2020_census'])]
colors = ['#4C014B', '#844593']

fig, ax = plt.subplots(figsize=(3, 3))
ax.pie(sizes, labels=labels, colors=colors, explode=[0.5, 0])
plt.show()

# Same comparison, restricted to the first four rows of `alien_percentage`.
# NOTE(review): presumably these are the four most alien-populated states —
# this depends on the row order of the merged frame; confirm.
labels = ['Aliens', 'Humans']
sizes = [sum(alien_percentage['population'][:4]), sum(alien_percentage['2020_census'][:4])]
colors = ['#4C014B', '#844593']

fig, ax = plt.subplots(figsize=(3, 3))
ax.pie(sizes, labels=labels, colors=colors, explode=[0.5, 0])
plt.show()

# Pie chart of aggressive vs. non-aggressive aliens. The `aggressive` column
# holds the strings 'TRUE' / 'FALSE', so the comparison yields a boolean mask.
labels = ['Aggressive', 'Non-aggressive']
aggressive_count = int((details['aggressive'] == 'TRUE').sum())
# The second wedge must be the remainder, not the total row count; using the
# total would understate the aggressive share.
sizes = [aggressive_count, len(details) - aggressive_count]
colors = ['#844593', '#4C014B']

fig, ax = plt.subplots(figsize=(3, 3))
ax.pie(sizes, labels=labels, colors=colors)
plt.show()

# Does any favourite food mention humans? Use a case-insensitive substring
# search: food names are capitalised and often carry qualifiers (for example
# 'Kangaroo, red'), so an exact match on 'human' would miss 'Human' or
# 'Human, adult'.
bool(details['favorite_food'].str.contains('human', case=False, na=False, regex=False).any())

False

# !pip install fuzzywuzzy

# !pip install python-Levenshtein

# Distinct occupations found in the aliens' location data; preview the first ten.
alien_occupation = location['occupation'].unique()
alien_occupation[:10]

array(['Senior Cost Accountant', 'Senior Sales Associate',
       'Registered Nurse', 'Director of Sales', 'Administrative Officer',
       'Programmer II', 'Pharmacist', 'Recruiter', 'Design Engineer',
       'Research Associate'], dtype=object)

# Check for missing occupations. `np.nan in array` relies on ==, and NaN never
# compares equal to itself, so it is not a reliable test; pd.isna is.
bool(pd.isna(alien_occupation).any())

False

# Number of distinct alien occupations.
len(alien_occupation)

195

# Load occupation names and their 2020 average wage, dropping incomplete rows.
usa_professions = pd.read_csv(r'usa_professions.csv')[['PUMS Occupation', 'Average Wage (2020)']].dropna()
usa_professions.head()

# Highest average wage in the table.
max_salary = usa_professions['Average Wage (2020)'].max()
max_salary

266984.015114905

def power_coefficient(df, wage_scale=200000, power_scale=3.6):
    """Map an occupation's average wage onto an integer "power" score.

    Args:
        df: Row-like mapping with an 'Average Wage (2020)' entry (for example
            a row passed in by ``DataFrame.apply(..., axis=1)``).
        wage_scale: Wage that maps to ``power_scale`` before rounding.
        power_scale: Score assigned to a wage equal to ``wage_scale``.

    Returns:
        ``wage / wage_scale * power_scale`` rounded to the nearest integer.
        Wages above ``wage_scale`` are not clipped, so the result can exceed
        ``power_scale``.
    """
    return round(df['Average Wage (2020)'] / wage_scale * power_scale)

# Score every occupation row by row, then keep only the name and the score.
usa_professions['power_coefficient'] = usa_professions.apply(power_coefficient, axis=1)
usa_professions = usa_professions[['PUMS Occupation', 'power_coefficient']]
usa_professions.head()

# Every combination of an alien occupation with a PUMS occupation, as
# two-element lists in alien-occupation-major order.
pairs = [
    [alien_job, pums_job]
    for alien_job in alien_occupation
    for pums_job in usa_professions['PUMS Occupation']
]
pairs[:10]

[['Senior Cost Accountant', 'General & operations managers'],
 ['Senior Cost Accountant', 'Chief executives & legislators'],
 ['Senior Cost Accountant', 'Advertising & promotions managers'],
 ['Senior Cost Accountant', 'Marketing managers'],
 ['Senior Cost Accountant', 'Sales managers'],
 ['Senior Cost Accountant', 'Public relations and fundraising managers'],
 ['Senior Cost Accountant', 'Administrative services managers'],
 ['Senior Cost Accountant', 'Facilities managers'],
 ['Senior Cost Accountant', 'Computer & information systems managers'],
 ['Senior Cost Accountant', 'Financial managers']]

# Keep the pairs whose fuzzy partial-match ratio is at least 70.
matched = [pair for pair in pairs if fuzz.partial_ratio(*pair) >= 70]
matched[:10]

[['Registered Nurse', 'Registered nurses'],
 ['Administrative Officer', 'Administrative services managers'],
 ['Programmer II', 'Computer programmers'],
 ['Programmer II',
  'Computer numerically controlled tool operators and programmers'],
 ['Pharmacist', 'Pharmacists'],
 ['Pharmacist', 'Pharmacy technicians'],
 ['Pharmacist', 'Pharmacy aides'],
 ['Design Engineer', 'Sales engineers'],
 ['Research Associate',
  'Miscellaneous life, physical, & social science technicians, including social science research assistants'],
 ['Staff Scientist',
  'Environmental scientists and specialists, including health']]

# Number of occupation pairs that passed the match threshold.
len(matched)

362

# Tabulate the matched pairs: alien occupation next to its PUMS occupation.
matched_professions = pd.DataFrame(matched, columns=['occupation', 'PUMS Occupation'])
matched_professions.head()

# For every alien, look up the PUMS occupations matched to its occupation and
# their power scores; rows left without a match or score are dropped.
alien_occupation_and_power = (
    location[['occupation']]
    .merge(matched_professions, on='occupation', how='left')
    .merge(usa_professions, on='PUMS Occupation', how='left')
    .dropna()
)
alien_occupation_and_power.head()

# Mean power score per alien occupation.
alien_power = alien_occupation_and_power.groupby('occupation')['power_coefficient'].mean().reset_index()
alien_power.head()

# Scatter of the mean score per occupation; the x tick labels are blanked out.
graph = sns.scatterplot(x='occupation', y='power_coefficient', data=alien_power, color='#844593')
graph.set(xticklabels=[])
plt.show()

# Summary statistics of the per-occupation scores.
alien_power.describe()

# Report the band of one standard deviation around the mean score, with both
# ends rounded to whole numbers.
print(f"Alien power varies from {round(alien_power['power_coefficient'].mean() - alien_power['power_coefficient'].std())} \
to {round(alien_power['power_coefficient'].mean() + alien_power['power_coefficient'].std())}")

Alien power varies from 1 to 2

	id	first_name	last_name	email	gender	type	birth_year
0	1	Tyrus	Wrey	twrey0@sakura.ne.jp	Agender	Reptile	1717
1	2	Ealasaid	St Louis	estlouis1@amazon.co.uk	Female	Flatwoods	1673
2	3	Violette	Sawood	vsawood2@yolasite.com	Female	Nordic	1675
3	4	Rowan	Saintsbury	rsaintsbury3@rediff.com	Male	Green	1731
4	5	Free	Ingolotti	fingolotti4@bbb.org	Genderfluid	Flatwoods	1763

	id	favorite_food	feeding_frequency	aggressive
0	1	White-faced tree rat	Weekly	TRUE
1	2	Lizard, goanna	Seldom	FALSE
2	3	Indian red admiral	Weekly	TRUE
3	4	Bandicoot, southern brown	Often	FALSE
4	5	Kangaroo, red	Once	FALSE

	id	current_location	state	country	occupation
0	1	Cincinnati	Ohio	United States	Senior Cost Accountant
1	2	Bethesda	Maryland	United States	Senior Sales Associate
2	3	Oakland	California	United States	Registered Nurse
3	4	Richmond	Virginia	United States	Director of Sales
4	5	Atlanta	Georgia	United States	Administrative Officer

	id	first_name	last_name	email	gender	type	birth_year	favorite_food	feeding_frequency	aggressive	current_location	state	country	occupation
0	1	Tyrus	Wrey	twrey0@sakura.ne.jp	Agender	Reptile	1717	White-faced tree rat	Weekly	TRUE	Cincinnati	Ohio	United States	Senior Cost Accountant
1	2	Ealasaid	St Louis	estlouis1@amazon.co.uk	Female	Flatwoods	1673	Lizard, goanna	Seldom	FALSE	Bethesda	Maryland	United States	Senior Sales Associate
2	3	Violette	Sawood	vsawood2@yolasite.com	Female	Nordic	1675	Indian red admiral	Weekly	TRUE	Oakland	California	United States	Registered Nurse
3	4	Rowan	Saintsbury	rsaintsbury3@rediff.com	Male	Green	1731	Bandicoot, southern brown	Often	FALSE	Richmond	Virginia	United States	Director of Sales
4	5	Free	Ingolotti	fingolotti4@bbb.org	Genderfluid	Flatwoods	1763	Kangaroo, red	Once	FALSE	Atlanta	Georgia	United States	Administrative Officer

	state	geometry
0	West Virginia	POLYGON ((-81.74725 39.09538, -81.74635 39.096...
1	Florida	MULTIPOLYGON (((-82.98339 24.60263, -82.98624 ...
2	Illinois	POLYGON ((-91.18529 40.63780, -91.17510 40.643...
3	Minnesota	POLYGON ((-96.78438 46.63050, -96.78434 46.630...
4	Maryland	POLYGON ((-77.45881 39.22027, -77.45866 39.220...

Hipotezė¶

Duomenų paėmimas iš serverio¶

Pirminis duomenų tvarkymas ¶

1. Ar jų gyvenamoji vieta pritaikyta jų rūšiai, dėl to nekelia jiems streso?¶

2. Ar jų koncentracija valstijose nėra kritinė?¶

3. Ar plėšrios rūšys neminta žmonėmis?¶

4. Ar jų profesijos nedaro akivaizdžiai neigiamos įtakos individams arba visuomenei?¶

Galutinė išvada:¶

	state	population	geometry
0	Texas	5413	POLYGON ((-103.98018 32.00012, -103.97994 32.0...
1	California	5410	MULTIPOLYGON (((-119.63472 33.26544, -119.6363...
2	Florida	4176	MULTIPOLYGON (((-82.98339 24.60263, -82.98624 ...
3	New York	2690	MULTIPOLYGON (((-74.04657 40.68964, -74.04675 ...
4	Ohio	1851	POLYGON ((-84.80325 40.98939, -84.80324 40.991...

	state	2020_census
0	California	39538223
1	Texas	29145505
2	Florida	21538187
3	New York	20201249
4	Pennsylvania	13002700

	PUMS Occupation	Average Wage (2020)
0	General & operations managers	84813.100671
1	Chief executives & legislators	156122.416454
2	Advertising & promotions managers	78689.732180
3	Marketing managers	89221.844910
4	Sales managers	105004.383049

	occupation	PUMS Occupation
0	Registered Nurse	Registered nurses
1	Administrative Officer	Administrative services managers
2	Programmer II	Computer programmers
3	Programmer II	Computer numerically controlled tool operators...
4	Pharmacist	Pharmacists

	occupation	power_coefficient
0	Account Coordinator	1.0
1	Accountant I	1.0
2	Accountant II	1.0
3	Accountant III	1.0
4	Accountant IV	1.0

	power_coefficient
count	117.000000
mean	1.109285
std	0.504047
min	0.000000
25%	1.000000
50%	1.000000
75%	1.250000
max	3.000000